# This script takes the filtered data, estimates the topic model, and then writes
# the relevant output to a file. We want this step to be separate from everything
# else because we don't want to re-estimate the topic model unnecessarily.
library(tidyverse)
library(arrow)
library(stm)
library(tm)
library(quanteda)

# Read the filtered sample from parquet and fix the RNG seed for this script.
filtered_newswhip <- read_parquet(
  file = "data/processed/filtered_newswhip_sample.parquet"
)
set.seed(seed = 1683767830)

# Turn the `headline_and_blurb` text into stm's document format, carrying the
# whole data frame along as per-document metadata.
# TODO: decide on stemming.
# TODO: we should filter on stopwords earlier and not rely on the STM for this.
proc <- textProcessor(
  documents = filtered_newswhip$headline_and_blurb,
  metadata = filtered_newswhip,
  removestopwords = TRUE,
  stem = TRUE
)

# Prune the vocabulary with a lower document-frequency threshold of 10.
# Downstream code uses out$documents / out$vocab / out$meta, i.e. the objects
# as returned by prepDocuments() rather than the raw data frame.
out <- prepDocuments(
  documents = proc$documents,
  vocab = proc$vocab,
  meta = proc$meta,
  lower.thresh = 10
)

# Estimation mode. Flip this flag instead of commenting code in and out:
#   TRUE  - full model sweep with selectModel() (slow); every retained run is
#           additionally written to data/multiple_topic_models.RData.
#   FALSE - a single stm() fit (fast); only data/topic_model.RData is written.
# Previously the final save() of `models` ran unconditionally, so switching to
# the single-model path failed with "object 'models' not found".
run_full_sweep <- TRUE

# NOTE(review): the prevalence formula references out$meta$... directly even
# though `data = out$meta` is supplied; bare column names
# (~ publisher + s(day)) would be the usual form. Left unchanged here so the
# formula stored in the saved model object does not change.
if (run_full_sweep) {
  # NOTE(review): confirm that a 75-run sweep is useful with
  # init.type = "Spectral"; the stm documentation describes spectral
  # initialisation as deterministic (the K = 0 case excepted) - TODO confirm.
  models <- selectModel(out$documents,
                        out$vocab,
                        K = 0,
                        data = out$meta,
                        prevalence = ~ out$meta$publisher + s(out$meta$day),
                        runs = 75,
                        max.em.its = 100,
                        seed = 42,
                        init.type = "Spectral")

  # plot(models)
  # Use the first retained run as the model the rest of the pipeline works on.
  target_model <- models$runout[[1]]
} else {
  target_model <- stm(out$documents,
                      out$vocab,
                      K = 0,
                      data = out$meta,
                      prevalence = ~ out$meta$publisher + s(out$meta$day),
                      # prevalence = ~ as.factor(out$meta$source.domain),
                      seed = 42,
                      max.em.its = 100,
                      init.type = "Spectral")
}

# Persist the chosen model together with the inputs needed to interpret it.
# The paths are constants, so no glue() interpolation is needed.
save(target_model, filtered_newswhip, out, proc,
     file = "data/topic_model.RData")

# `models` only exists after a full sweep, so only save it in that mode.
if (run_full_sweep) {
  save(models, filtered_newswhip, out, proc,
       file = "data/multiple_topic_models.RData")
}
